# set directory
setwd("~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code")

# load packages
library(dada2)
library("phyloseq")
library(ggplot2)
library(tidyverse)
library(fantaxtic)
library(DECIPHER)
library(phangorn)
library(reshape)
library(janitor)
library(multcomp)

# load physeq object
load("~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code/phyloseq final variables.RData")

# add organic soil C:N gradient 
soil.data <- read.csv("soil.data.csv")
sample_data(physeq)$organic.soil.C.N <- soil.data$organic.soil.C.N
sample_data(physeq)$organic.soil.GWC <- soil.data$organic.soil.GWC
sample_data(physeq)$carcass.deposition <- soil.data$carcass.deposition
sample_data(physeq)$ph <- soil.data$ph
sample_data(physeq)$decomposing.carcass <- soil.data$decomposing.carcass

################################
# DISTANCE FROM BANK #
################################

# make sure to remove bog sample first!!
physeq_nobog <- subset_samples(physeq, Transect != "Bog")

# plot Hansen stream specific alpha diversity metrics (separately due to space) by distance
p <- plot_richness(physeq_nobog, color="Distance", x="Distance", measures=c("Observed"))

# boxplot
p + geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)

# change distance 5 to 6 for consistency
p$data$Distance[67] <- 6

# or bar graph of means with CI for each category computes the mean confidence interval 
# for each category based on a bootstrapping method (a simulation technique to avoid 
# making an assumption about a normal distribution)
ggplot(data = p$data, aes(Distance, value, group=Distance,  fill="lightblue", fill=Distance)) + 
  stat_summary(fun.data=mean_sdl, geom="bar", position=position_dodge(width=0.95)) + 
  facet_wrap(~variable, ncol=3, scales="free") + 
  stat_summary(fun.data=mean_cl_boot, geom="errorbar", 
  position=position_dodge(width=0.95), width=0.3) + theme_bw() + 
  scale_y_continuous(limits=c(0,3.5),expand = c(0, 0)) + theme( strip.text.x = element_blank(),
   legend.position="none") + xlab("Distance (m)") + ylab("Shannon Diversity")

# get numbers for alpha diversity metrics at Hansen
alpha.diversity <- estimate_richness(physeq_nobog, measures=c("Observed","Shannon", "Simpson"))
head(alpha.diversity)

# combine phyloseq object with alpha diversity metrics 
data <- cbind(sample_data(physeq_nobog), alpha.diversity)

# add depth as potential covariate
data$Depth <- sample_sums(physeq_nobog)

# try linear model (assumptions likely not met)
mod1 <- lm(Observed~Distance, data=data)
mod2 <- lm(Shannon~Distance, data=data)
mod3 <- lm(Simpson~Distance, data=data)
summary(mod1)
summary(mod2)
summary(mod3)
plot(mod1)

# Observed, Shannon index all decrease with distance

# non-parametric regression is probably better, try the Kendall-Theil regression
library(mblm)
mod1 = mblm(Observed ~ Distance,data=data)
mod2 = mblm(Shannon ~ Distance,data=data)
mod3 = mblm(Simpson ~ Distance,data=data)
summary(mod1)
summary(mod2)
summary(mod3)
mod2$coefficients[2] <- -0.005
mod2$coefficients[1] <- 3.2

# Observed, Shannon and Simpson all decrease with distance!
# lets' visualize this!

pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod1, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

ggplot(NULL, aes(x,y)) + 
  geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)+
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.1*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.1*upr), linetype="dotted")+ 
  theme(panel.background = element_blank(),panel.border = element_rect(colour = "black", 
  fill = NA, linewidth = 1.2)) + labs(y = "Species Richness", x="Distance")

################################
# DISTANCE FROM BANK AT HANSEN # 
################################

# filter phyloseq object by Hansen stream
physeq_hansen <- subset_samples(physeq, Stream=="Hansen")

# plot Hansen stream specific alpha diversity metrics (separately due to space) by distance
p <- plot_richness(physeq_hansen, color="Distance", x="Distance", measures=c("Shannon"))

# boxplot
p + geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)

# put distance 5 with 6 for consistency
p$data$Distance[37] <- 6

# or bar graph of means with CI for each category computes the mean confidence interval 
# for each category based on a bootstrapping method (a simulation technique to avoid 
# making an assumption about a normal distribution)
ggplot(data = p$data, aes(Distance, value, group=Distance,  fill="lightblue", fill=Distance)) + 
  stat_summary(fun.data=mean_sdl, geom="bar", position=position_dodge(width=0.95)) + 
  facet_wrap(~variable, ncol=3, scales="free") + 
  stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(width=0.95), width=0.3)

# get numbers for alpha diversity metrics at Hansen
alpha.diversity <- estimate_richness(physeq_hansen, measures=c("Observed","Shannon", "Simpson"))
head(alpha.diversity)

# combine phyloseq object with alpha diversity metrics 
data <- cbind(sample_data(physeq_hansen), alpha.diversity)

# add depth as potential covariate
data$Depth <- sample_sums(physeq_hansen)

# try linear model (assumptions likely not met)
mod1 <- lm(Observed~Distance, data=data)
mod2 <- lm(Shannon~Distance, data=data)
mod3 <- lm(Simpson~Distance, data=data)
summary(mod1)
summary(mod2)
summary(mod3)
plot(mod1)

# non-parametric regression is probably better, try the Kendall-Theil regression
library(mblm)
mod1 = mblm(Observed ~ Distance,data=data)
mod2 = mblm(Shannon ~ Distance,data=data)
mod3 = mblm(Simpson ~ Distance,data=data)
summary(mod1)
summary(mod2) # SIGNIFICANT, Shannon diversity increases with distance
summary(mod3)

# visualize this
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod2, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

ggplot(NULL, aes(x,y)) + 
  geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)+
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.05*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.05*upr), linetype="dotted")+ 
  theme(panel.background = element_blank(),panel.border = element_rect(colour = "black", 
  fill = NA, linewidth = 1.2)) + labs(y = "Shannon Diversity", x="Distance")+ylim(c(1.5,4))


# Shannon index increases with distance at Hansen

# see if differences in distance by side?
# filter values of each diversity metric for testing
hansen_left <- data[which(data$Side == "SL"),]
hansen_right <- data[which(data$Side == "SR"),]

# observed 
mod4 = mblm(Observed ~ Distance, data=hansen_left)
mod5 = mblm(Observed ~ Distance, data=hansen_right)
summary(mod4)
summary(mod5) # SIGNIFICANT
# Species richness increases with distance on Hansen right bank

mod6 = mblm(Shannon ~ Distance, data=hansen_left)
mod7 = mblm(Shannon ~ Distance, data=hansen_right)
summary(mod6)
summary(mod7) # SIGNIFICANT
# Shannon index increases with distance on Hansen right bank

mod8 = mblm(Simpson ~ Distance, data=hansen_left)
mod9 = mblm(Simpson ~ Distance, data=hansen_right)
summary(mod8)
summary(mod9)
# no difference

# visualize this

# filter phyloseq object by right or left side of Hansen stream
physeq_hansen_right <- subset_samples(physeq_hansen, Side=="SR")
physeq_hansen_left <- subset_samples(physeq_hansen, Side=="SL")

# plot Hansen stream specific alpha diversity metrics (separately due to space) by distance
p <- plot_richness(physeq_hansen_left, color="Distance", x="Distance", measures=c("Shannon"))

# boxplot
p + geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)

# put distance 5 with 6 for consistency (only for right bank!)
p$data$Distance[17] <- 6

# or bar graph of means with CI for each category computes the mean confidence interval 
# for each category based on a bootstrapping method (a simulation technique to avoid 
# making an assumption about a normal distribution)
ggplot(data = p$data, aes(Distance, value, group=Distance,  fill="lightblue", fill=Distance)) + 
  stat_summary(fun.data=mean_sdl, geom="bar", position=position_dodge(width=0.95)) + 
  facet_wrap(~variable, ncol=3, scales="free") + 
  stat_summary(fun.data=mean_cl_boot, geom="errorbar", 
  position=position_dodge(width=0.95), width=0.3) + ylab("Shannon Diversity") + theme_bw()+
  scale_y_continuous(limits=c(0,4),expand = c(0, 0)) + theme( strip.text.x = element_blank(),
  legend.position="none") + xlab("Distance (m)")

# boxplot with model prediction
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod6, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

ggplot(NULL, aes(x,y)) + 
  geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)+
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.1*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.1*upr), linetype="dotted")+ 
  theme(panel.background = element_blank(),panel.border = element_rect(colour = "black", 
  fill = NA, linewidth = 1.2)) + labs(y = "Shannon Diversity", x="Distance")

################################
# DISTANCE FROM BANK AT HAPPY # 
################################

# make sure to remove bog sample first!!
physeq_nobog <- subset_samples(physeq, Transect !="Bog")

# filter phyloseq object by Happy stream
physeq_happy <- subset_samples(physeq_nobog, Stream=="Happy")

# plot Hansen stream specific alpha diversity metrics (separately due to space) by distance
p <- plot_richness(physeq_happy, color="Distance", x="Distance", measures=c("Observed"))

# boxplot
p + geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)

# or bar graph of means with CI for each category computes the mean confidence interval 
# for each category based on a bootstrapping method (a simulation technique to avoid 
# making an assumption about a normal distribution)
ggplot(data = p$data, aes(Distance, value, group=Distance,  fill="lightblue", fill=Distance)) + 
  stat_summary(fun.data=mean_sdl, geom="bar", position=position_dodge(width=0.95)) + 
  facet_wrap(~variable, ncol=3, scales="free") + 
  stat_summary(fun.data=mean_cl_boot, geom="errorbar", 
  position=position_dodge(width=0.95), width=0.3) + theme_bw() + 
  scale_y_continuous(limits=c(0,300),expand = c(0, 0)) + theme( strip.text.x = element_blank(),
  legend.position="none") + xlab("Distance (m)") + ylab("Species richness")

# get numbers for alpha diversity metrics at Hansen
alpha.diversity <- estimate_richness(physeq_happy, measures=c("Observed","Shannon", "Simpson"))
head(alpha.diversity)

# combine phyloseq object with alpha diversity metrics 
data <- cbind(sample_data(physeq_happy), alpha.diversity)

# add depth as potential covariate
data$Depth <- sample_sums(physeq_happy)

# try linear model (assumptions likely not met)
mod1 <- lm(Observed~Distance, data=data) # SIGNIFICANT
mod2 <- lm(Shannon~Distance, data=data) # SIGNIFICANT
mod3 <- lm(Simpson~Distance, data=data) # SIGNIFICANT
summary(mod1)
summary(mod2)
summary(mod3)
library(multimode)
modetest(data$Shannon)
plot(mod1)

# Observed, Shannon and Simpsons' index all decrease with distance

# non-parametric regression is probably better, try the Kendall-Theil regression
library(mblm)
mod1 = mblm(Observed ~ Distance,data=data) # SIGNIFICANT
mod2 = mblm(Shannon ~ Distance,data=data) # SIGNIFICANT
mod3 = mblm(Simpson ~ Distance,data=data) # SIGNIFICANT
summary(mod1)
summary(mod2)
summary(mod3)

# Observed, Shannon and Simpson all decrease with distance

p <- plot_richness(physeq_happy, color="Distance", x="Distance", measures=c("Shannon"))
# boxplot with model prediction
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod2, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

ggplot(NULL, aes(x,y)) + 
  geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)+
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.1*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.1*upr), linetype="dotted")+ 
  theme(panel.background = element_blank(),panel.border = element_rect(colour = "black", 
  fill = NA, linewidth = 1.2)) + labs(y = "Shannon Diversity", x="Distance")

################################
# DISTANCE FROM BANK AT YAKO # 
################################

# filter phyloseq object by Happy stream
physeq_yako <- subset_samples(physeq, Stream=="Yako")

# plot Hansen stream specific alpha diversity metrics (separately due to space) by distance
p <- plot_richness(physeq_yako, color="Distance", x="Distance", measures=c("Observed"))

# boxplot
p + geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)

# or bar graph of means with CI for each category computes the mean confidence interval 
# for each category based on a bootstrapping method (a simulation technique to avoid 
# making an assumption about a normal distribution)
ggplot(data = p$data, aes(Distance, value, group=Distance,  fill="lightblue", fill=Distance)) + 
  stat_summary(fun.data=mean_sdl, geom="bar", position=position_dodge(width=0.95)) + 
  facet_wrap(~variable, ncol=3, scales="free") + 
  stat_summary(fun.data=mean_cl_boot, geom="errorbar", 
  position=position_dodge(width=0.95), width=0.3) + theme_bw() + 
  scale_y_continuous(limits=c(0,350),expand = c(0, 0)) + theme( strip.text.x = element_blank(),
  legend.position="none") + xlab("Distance (m)") + ylab("Species Richness")


# get numbers for alpha diversity metrics at Hansen
alpha.diversity <- estimate_richness(physeq_yako, measures=c("Observed","Shannon", "Simpson"))
head(alpha.diversity)

# combine phyloseq object with alpha diversity metrics 
data <- cbind(sample_data(physeq_yako), alpha.diversity)

# add depth as potential covariate
data$Depth <- sample_sums(physeq_yako)

# try linear model (assumptions likely not met)
mod1 <- lm(Observed~Distance, data=data) # SIGNIFICANT
mod2 <- lm(Shannon~Distance, data=data) # SIGNIFICANT
mod3 <- lm(Simpson~Distance, data=data)
summary(mod1)
summary(mod2)
summary(mod3)
modetest(data$Shannon)
plot(mod1)

# Observed, Shannon index all decrease with distance

# non-parametric regression is probably better, try the Kendall-Theil regression
library(mblm)
mod1 = mblm(Observed ~ Distance,data=data) # SIGNIFICANT
mod2 = mblm(Shannon ~ Distance,data=data) # SIGNIFICANT
mod3 = mblm(Simpson ~ Distance,data=data) # SIGNIFICANT
summary(mod1)
summary(mod2)
summary(mod3)

# Observed, Shannon and Simpson all decrease with distance!

p <- plot_richness(physeq_yako, color="Distance", x="Distance", measures=c("Shannon"))
# boxplot with model prediction
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod2, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

ggplot(NULL, aes(x,y)) + 
  geom_boxplot(data = p$data, aes(x = Distance, y = value, group=Distance, color = NULL), alpha = 0.1)+
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.05*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.05*upr), linetype="dotted")+ 
  theme(panel.background = element_blank(),panel.border = element_rect(colour = "black", 
  fill = NA, linewidth = 1.2)) + labs(y = "Shannon Diversity", x="Distance")
